#Загрузка библиотек

library(dplyr)
## 
## Присоединяю пакет: 'dplyr'
## Следующие объекты скрыты от 'package:stats':
## 
##     filter, lag
## Следующие объекты скрыты от 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
theme_set(theme_minimal())
library(ggpubr)
## Warning: пакет 'ggpubr' был собран под R версии 4.3.2
library(plotly)
## Warning: пакет 'plotly' был собран под R версии 4.3.2
## 
## Присоединяю пакет: 'plotly'
## Следующий объект скрыт от 'package:ggplot2':
## 
##     last_plot
## Следующий объект скрыт от 'package:stats':
## 
##     filter
## Следующий объект скрыт от 'package:graphics':
## 
##     layout
library(rstatix)
## Warning: пакет 'rstatix' был собран под R версии 4.3.2
## 
## Присоединяю пакет: 'rstatix'
## Следующий объект скрыт от 'package:stats':
## 
##     filter

#Задание 1

df = readRDS("life_expectancy_data.RDS")
summary(df)
##    Country               Year         Gender          Life expectancy
##  Length:195         Min.   :2019   Length:195         Min.   :55.49  
##  Class :character   1st Qu.:2019   Class :character   1st Qu.:70.02  
##  Mode  :character   Median :2019   Mode  :character   Median :77.55  
##                     Mean   :2019                      Mean   :75.52  
##                     3rd Qu.:2019                      3rd Qu.:80.95  
##                     Max.   :2019                      Max.   :88.10  
##   Unemployment    Infant Mortality      GDP                 GNI           
##  Min.   : 0.178   Min.   : 1.40    Min.   :1.884e+08   Min.   :3.754e+08  
##  1st Qu.: 3.735   1st Qu.: 5.35    1st Qu.:1.117e+10   1st Qu.:1.094e+10  
##  Median : 5.960   Median :13.50    Median :3.967e+10   Median :4.009e+10  
##  Mean   : 8.597   Mean   :19.61    Mean   :4.660e+11   Mean   :4.864e+11  
##  3rd Qu.:10.958   3rd Qu.:30.23    3rd Qu.:2.476e+11   3rd Qu.:2.457e+11  
##  Max.   :36.442   Max.   :75.80    Max.   :2.143e+13   Max.   :2.171e+13  
##  Clean fuels and cooking technologies   Per Capita      
##  Min.   :  0.00                       Min.   :   228.2  
##  1st Qu.: 34.50                       1st Qu.:  2165.3  
##  Median : 80.70                       Median :  6624.8  
##  Mean   : 65.98                       Mean   : 16821.0  
##  3rd Qu.:100.00                       3rd Qu.: 19439.7  
##  Max.   :100.00                       Max.   :175813.9  
##  Mortality caused by road traffic injury Tuberculosis Incidence
##  Min.   : 0.00                           Min.   :  0.0         
##  1st Qu.: 8.20                           1st Qu.: 12.0         
##  Median :16.00                           Median : 46.0         
##  Mean   :17.06                           Mean   :103.8         
##  3rd Qu.:24.00                           3rd Qu.:138.5         
##  Max.   :64.60                           Max.   :654.0         
##  DPT Immunization HepB3 Immunization Measles Immunization Hospital beds   
##  Min.   :35.00    Min.   :35.00      Min.   :37.00        Min.   : 0.200  
##  1st Qu.:85.69    1st Qu.:81.31      1st Qu.:84.85        1st Qu.: 1.301  
##  Median :92.00    Median :91.00      Median :92.00        Median : 2.570  
##  Mean   :87.99    Mean   :86.76      Mean   :87.31        Mean   : 2.997  
##  3rd Qu.:97.00    3rd Qu.:96.00      3rd Qu.:96.50        3rd Qu.: 3.773  
##  Max.   :99.00    Max.   :99.00      Max.   :99.00        Max.   :13.710  
##  Basic sanitation services Tuberculosis treatment Urban population
##  Min.   :  8.632           Min.   :  0.00         Min.   : 13.25  
##  1st Qu.: 62.919           1st Qu.: 73.00         1st Qu.: 41.92  
##  Median : 91.144           Median : 82.00         Median : 58.76  
##  Mean   : 77.380           Mean   : 77.57         Mean   : 59.12  
##  3rd Qu.: 98.582           3rd Qu.: 88.00         3rd Qu.: 78.02  
##  Max.   :100.000           Max.   :100.00         Max.   :100.00  
##  Rural population Non-communicable Mortality  Sucide Rate        continent 
##  Min.   : 0.00    Min.   : 4.40              Min.   : 0.300   Africa  :52  
##  1st Qu.:21.98    1st Qu.:11.85              1st Qu.: 2.050   Americas:38  
##  Median :41.24    Median :17.20              Median : 3.500   Asia    :42  
##  Mean   :40.88    Mean   :17.05              Mean   : 4.802   Europe  :48  
##  3rd Qu.:58.08    3rd Qu.:22.10              3rd Qu.: 6.600   Oceania :15  
##  Max.   :86.75    Max.   :43.70              Max.   :30.100

#Задание 2

plot_ly(
  data = df[(df$`Life expectancy` != 0) & (df$`Tuberculosis Incidence` != 0),],
  x = ~ df$`Tuberculosis Incidence`,
  y = ~ df$`Life expectancy`,
  color = df$continent) %>%
  layout(
    title = 'Отношение продолжительности жизни и заболеваемостью туберкулезом',
    yaxis = list(title = 'Ожидаемая продолжительность жизни',
                 zeroline = FALSE),  # Уберём выделения нулевых осей по y
    xaxis = list(title = 'Заболеваемость туберкулезом',
                 zeroline = FALSE)) # Уберём выделения нулевых осей по y
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

#Задание 3

newdf = df[df$continent %in% c('Africa', 'Americas'), ] #Новый дата-фрейм с выборкой только из Африки и Америки
ggqqplot(newdf[newdf$`Life expectancy`,], 
         x = "Life expectancy", facet.by = "continent")

ggplot(data = newdf, 
       aes(x = `Life expectancy`)) +
  geom_histogram()+
  facet_grid(newdf$continent)+
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

stat.test <- newdf %>% 
  wilcox_test(`Life expectancy` ~ continent) %>% 
  add_xy_position(x = "continent")
stat.test
## # A tibble: 1 × 11
##   .y.       group1 group2    n1    n2 statistic        p y.position groups  xmin
##   <chr>     <chr>  <chr>  <int> <int>     <dbl>    <dbl>      <dbl> <name> <dbl>
## 1 Life exp… Africa Ameri…    52    38       107 6.34e-13       86.6 <chr>      1
## # ℹ 1 more variable: xmax <dbl>
ggboxplot(
  newdf, 
  x = "continent", y = "Life expectancy", 
  ylab = "Life expectancy", xlab = "Continent", 
  add = "jitter"
  ) + 
  labs(subtitle = get_test_label(stat.test, detailed = TRUE)) + 
  stat_pvalue_manual(stat.test, tip.length = 0) 

#Задание 4

df_num = subset(df, select = -c(Country, Year, Gender, continent)) 
summary(df_num)
##  Life expectancy  Unemployment    Infant Mortality      GDP           
##  Min.   :55.49   Min.   : 0.178   Min.   : 1.40    Min.   :1.884e+08  
##  1st Qu.:70.02   1st Qu.: 3.735   1st Qu.: 5.35    1st Qu.:1.117e+10  
##  Median :77.55   Median : 5.960   Median :13.50    Median :3.967e+10  
##  Mean   :75.52   Mean   : 8.597   Mean   :19.61    Mean   :4.660e+11  
##  3rd Qu.:80.95   3rd Qu.:10.958   3rd Qu.:30.23    3rd Qu.:2.476e+11  
##  Max.   :88.10   Max.   :36.442   Max.   :75.80    Max.   :2.143e+13  
##       GNI            Clean fuels and cooking technologies   Per Capita      
##  Min.   :3.754e+08   Min.   :  0.00                       Min.   :   228.2  
##  1st Qu.:1.094e+10   1st Qu.: 34.50                       1st Qu.:  2165.3  
##  Median :4.009e+10   Median : 80.70                       Median :  6624.8  
##  Mean   :4.864e+11   Mean   : 65.98                       Mean   : 16821.0  
##  3rd Qu.:2.457e+11   3rd Qu.:100.00                       3rd Qu.: 19439.7  
##  Max.   :2.171e+13   Max.   :100.00                       Max.   :175813.9  
##  Mortality caused by road traffic injury Tuberculosis Incidence
##  Min.   : 0.00                           Min.   :  0.0         
##  1st Qu.: 8.20                           1st Qu.: 12.0         
##  Median :16.00                           Median : 46.0         
##  Mean   :17.06                           Mean   :103.8         
##  3rd Qu.:24.00                           3rd Qu.:138.5         
##  Max.   :64.60                           Max.   :654.0         
##  DPT Immunization HepB3 Immunization Measles Immunization Hospital beds   
##  Min.   :35.00    Min.   :35.00      Min.   :37.00        Min.   : 0.200  
##  1st Qu.:85.69    1st Qu.:81.31      1st Qu.:84.85        1st Qu.: 1.301  
##  Median :92.00    Median :91.00      Median :92.00        Median : 2.570  
##  Mean   :87.99    Mean   :86.76      Mean   :87.31        Mean   : 2.997  
##  3rd Qu.:97.00    3rd Qu.:96.00      3rd Qu.:96.50        3rd Qu.: 3.773  
##  Max.   :99.00    Max.   :99.00      Max.   :99.00        Max.   :13.710  
##  Basic sanitation services Tuberculosis treatment Urban population
##  Min.   :  8.632           Min.   :  0.00         Min.   : 13.25  
##  1st Qu.: 62.919           1st Qu.: 73.00         1st Qu.: 41.92  
##  Median : 91.144           Median : 82.00         Median : 58.76  
##  Mean   : 77.380           Mean   : 77.57         Mean   : 59.12  
##  3rd Qu.: 98.582           3rd Qu.: 88.00         3rd Qu.: 78.02  
##  Max.   :100.000           Max.   :100.00         Max.   :100.00  
##  Rural population Non-communicable Mortality  Sucide Rate    
##  Min.   : 0.00    Min.   : 4.40              Min.   : 0.300  
##  1st Qu.:21.98    1st Qu.:11.85              1st Qu.: 2.050  
##  Median :41.24    Median :17.20              Median : 3.500  
##  Mean   :40.88    Mean   :17.05              Mean   : 4.802  
##  3rd Qu.:58.08    3rd Qu.:22.10              3rd Qu.: 6.600  
##  Max.   :86.75    Max.   :43.70              Max.   :30.100
library(corrplot)
## Warning: пакет 'corrplot' был собран под R версии 4.3.2
## corrplot 0.92 loaded
df_cor <- cor(df_num)
df_cor
##                                         Life expectancy Unemployment
## Life expectancy                               1.0000000 -0.122453828
## Unemployment                                 -0.1224538  1.000000000
## Infant Mortality                             -0.8309072  0.103775270
## GDP                                           0.1688143 -0.111437568
## GNI                                           0.1786383 -0.109486971
## Clean fuels and cooking technologies          0.7637583  0.063975082
## Per Capita                                    0.6034817 -0.183778304
## Mortality caused by road traffic injury      -0.6518097  0.173386776
## Tuberculosis Incidence                       -0.5831370  0.121480015
## DPT Immunization                              0.5044753 -0.147098313
## HepB3 Immunization                            0.4023880 -0.102304627
## Measles Immunization                          0.5323483 -0.166019661
## Hospital beds                                 0.4849944 -0.147215966
## Basic sanitation services                     0.8515922  0.029209789
## Tuberculosis treatment                       -0.3214166 -0.051270213
## Urban population                              0.5829745  0.085895193
## Rural population                             -0.5829745 -0.085895193
## Non-communicable Mortality                   -0.6338148  0.131242386
## Sucide Rate                                   0.1593575  0.006558983
##                                         Infant Mortality         GDP
## Life expectancy                              -0.83090718  0.16881430
## Unemployment                                  0.10377527 -0.11143757
## Infant Mortality                              1.00000000 -0.16907590
## GDP                                          -0.16907590  1.00000000
## GNI                                          -0.16096299  0.99435959
## Clean fuels and cooking technologies         -0.77567408  0.13751753
## Per Capita                                   -0.31858469  0.17389277
## Mortality caused by road traffic injury       0.65050858 -0.11544108
## Tuberculosis Incidence                        0.56359507 -0.08695088
## DPT Immunization                             -0.59029923  0.10813790
## HepB3 Immunization                           -0.52710787  0.08719372
## Measles Immunization                         -0.58504641  0.10139187
## Hospital beds                                -0.52085961  0.13217297
## Basic sanitation services                    -0.77056506  0.14600318
## Tuberculosis treatment                        0.27989549 -0.02495462
## Urban population                             -0.47460671  0.15210771
## Rural population                              0.47460671 -0.15210771
## Non-communicable Mortality                    0.66661171 -0.19176139
## Sucide Rate                                   0.05980403  0.11409037
##                                                 GNI
## Life expectancy                          0.17863826
## Unemployment                            -0.10948697
## Infant Mortality                        -0.16096299
## GDP                                      0.99435959
## GNI                                      1.00000000
## Clean fuels and cooking technologies     0.13470096
## Per Capita                               0.18266349
## Mortality caused by road traffic injury -0.11353833
## Tuberculosis Incidence                  -0.08693997
## DPT Immunization                         0.10537313
## HepB3 Immunization                       0.08210454
## Measles Immunization                     0.09835173
## Hospital beds                            0.13351160
## Basic sanitation services                0.15141620
## Tuberculosis treatment                  -0.02115128
## Urban population                         0.16783616
## Rural population                        -0.16783616
## Non-communicable Mortality              -0.18266265
## Sucide Rate                              0.13051116
##                                         Clean fuels and cooking technologies
## Life expectancy                                                   0.76375825
## Unemployment                                                      0.06397508
## Infant Mortality                                                 -0.77567408
## GDP                                                               0.13751753
## GNI                                                               0.13470096
## Clean fuels and cooking technologies                              1.00000000
## Per Capita                                                        0.38753491
## Mortality caused by road traffic injury                          -0.59553317
## Tuberculosis Incidence                                           -0.54553537
## DPT Immunization                                                  0.44831815
## HepB3 Immunization                                                0.38243439
## Measles Immunization                                              0.49991332
## Hospital beds                                                     0.43564106
## Basic sanitation services                                         0.84316638
## Tuberculosis treatment                                           -0.31357924
## Urban population                                                  0.59435666
## Rural population                                                 -0.59435666
## Non-communicable Mortality                                       -0.65526759
## Sucide Rate                                                       0.00976739
##                                          Per Capita
## Life expectancy                          0.60348166
## Unemployment                            -0.18377830
## Infant Mortality                        -0.31858469
## GDP                                      0.17389277
## GNI                                      0.18266349
## Clean fuels and cooking technologies     0.38753491
## Per Capita                               1.00000000
## Mortality caused by road traffic injury -0.41639961
## Tuberculosis Incidence                  -0.30760893
## DPT Immunization                         0.21421912
## HepB3 Immunization                       0.09303434
## Measles Immunization                     0.21571852
## Hospital beds                            0.24966424
## Basic sanitation services                0.45409579
## Tuberculosis treatment                  -0.32644726
## Urban population                         0.42880232
## Rural population                        -0.42880232
## Non-communicable Mortality              -0.35531918
## Sucide Rate                              0.32281997
##                                         Mortality caused by road traffic injury
## Life expectancy                                                      -0.6518097
## Unemployment                                                          0.1733868
## Infant Mortality                                                      0.6505086
## GDP                                                                  -0.1154411
## GNI                                                                  -0.1135383
## Clean fuels and cooking technologies                                 -0.5955332
## Per Capita                                                           -0.4163996
## Mortality caused by road traffic injury                               1.0000000
## Tuberculosis Incidence                                                0.4123296
## DPT Immunization                                                     -0.3400658
## HepB3 Immunization                                                   -0.2628041
## Measles Immunization                                                 -0.3107449
## Hospital beds                                                        -0.4909601
## Basic sanitation services                                            -0.6320693
## Tuberculosis treatment                                                0.3130249
## Urban population                                                     -0.3718674
## Rural population                                                      0.3718674
## Non-communicable Mortality                                            0.4071462
## Sucide Rate                                                          -0.1102582
##                                         Tuberculosis Incidence DPT Immunization
## Life expectancy                                    -0.58313705       0.50447529
## Unemployment                                        0.12148001      -0.14709831
## Infant Mortality                                    0.56359507      -0.59029923
## GDP                                                -0.08695088       0.10813790
## GNI                                                -0.08693997       0.10537313
## Clean fuels and cooking technologies               -0.54553537       0.44831815
## Per Capita                                         -0.30760893       0.21421912
## Mortality caused by road traffic injury             0.41232959      -0.34006575
## Tuberculosis Incidence                              1.00000000      -0.37169763
## DPT Immunization                                   -0.37169763       1.00000000
## HepB3 Immunization                                 -0.31215616       0.94776877
## Measles Immunization                               -0.37364785       0.88078924
## Hospital beds                                      -0.19543396       0.32366629
## Basic sanitation services                          -0.55532307       0.45942955
## Tuberculosis treatment                              0.23672979      -0.13993470
## Urban population                                   -0.33622933       0.22057595
## Rural population                                    0.33622933      -0.22057595
## Non-communicable Mortality                          0.48089925      -0.38159200
## Sucide Rate                                         0.09858654       0.05567581
##                                         HepB3 Immunization Measles Immunization
## Life expectancy                                 0.40238797           0.53234834
## Unemployment                                   -0.10230463          -0.16601966
## Infant Mortality                               -0.52710787          -0.58504641
## GDP                                             0.08719372           0.10139187
## GNI                                             0.08210454           0.09835173
## Clean fuels and cooking technologies            0.38243439           0.49991332
## Per Capita                                      0.09303434           0.21571852
## Mortality caused by road traffic injury        -0.26280410          -0.31074490
## Tuberculosis Incidence                         -0.31215616          -0.37364785
## DPT Immunization                                0.94776877           0.88078924
## HepB3 Immunization                              1.00000000           0.86161432
## Measles Immunization                            0.86161432           1.00000000
## Hospital beds                                   0.27225503           0.33526203
## Basic sanitation services                       0.38112985           0.50904494
## Tuberculosis treatment                         -0.09250053          -0.14092951
## Urban population                                0.13692089           0.24604275
## Rural population                               -0.13692089          -0.24604275
## Non-communicable Mortality                     -0.31401541          -0.38626279
## Sucide Rate                                    -0.01978305           0.02560727
##                                         Hospital beds Basic sanitation services
## Life expectancy                             0.4849944                0.85159219
## Unemployment                               -0.1472160                0.02920979
## Infant Mortality                           -0.5208596               -0.77056506
## GDP                                         0.1321730                0.14600318
## GNI                                         0.1335116                0.15141620
## Clean fuels and cooking technologies        0.4356411                0.84316638
## Per Capita                                  0.2496642                0.45409579
## Mortality caused by road traffic injury    -0.4909601               -0.63206935
## Tuberculosis Incidence                     -0.1954340               -0.55532307
## DPT Immunization                            0.3236663                0.45942955
## HepB3 Immunization                          0.2722550                0.38112985
## Measles Immunization                        0.3352620                0.50904494
## Hospital beds                               1.0000000                0.47445249
## Basic sanitation services                   0.4744525                1.00000000
## Tuberculosis treatment                     -0.1947393               -0.30065649
## Urban population                            0.2740715                0.55069603
## Rural population                           -0.2740715               -0.55069603
## Non-communicable Mortality                 -0.3562093               -0.52254411
## Sucide Rate                                 0.2665261                0.15953741
##                                         Tuberculosis treatment Urban population
## Life expectancy                                    -0.32141658       0.58297452
## Unemployment                                       -0.05127021       0.08589519
## Infant Mortality                                    0.27989549      -0.47460671
## GDP                                                -0.02495462       0.15210771
## GNI                                                -0.02115128       0.16783616
## Clean fuels and cooking technologies               -0.31357924       0.59435666
## Per Capita                                         -0.32644726       0.42880232
## Mortality caused by road traffic injury             0.31302487      -0.37186744
## Tuberculosis Incidence                              0.23672979      -0.33622933
## DPT Immunization                                   -0.13993470       0.22057595
## HepB3 Immunization                                 -0.09250053       0.13692089
## Measles Immunization                               -0.14092951       0.24604275
## Hospital beds                                      -0.19473929       0.27407149
## Basic sanitation services                          -0.30065649       0.55069603
## Tuberculosis treatment                              1.00000000      -0.28393086
## Urban population                                   -0.28393086       1.00000000
## Rural population                                    0.28393086      -1.00000000
## Non-communicable Mortality                          0.26680379      -0.53028884
## Sucide Rate                                        -0.07289482       0.08936862
##                                         Rural population
## Life expectancy                              -0.58297452
## Unemployment                                 -0.08589519
## Infant Mortality                              0.47460671
## GDP                                          -0.15210771
## GNI                                          -0.16783616
## Clean fuels and cooking technologies         -0.59435666
## Per Capita                                   -0.42880232
## Mortality caused by road traffic injury       0.37186744
## Tuberculosis Incidence                        0.33622933
## DPT Immunization                             -0.22057595
## HepB3 Immunization                           -0.13692089
## Measles Immunization                         -0.24604275
## Hospital beds                                -0.27407149
## Basic sanitation services                    -0.55069603
## Tuberculosis treatment                        0.28393086
## Urban population                             -1.00000000
## Rural population                              1.00000000
## Non-communicable Mortality                    0.53028884
## Sucide Rate                                  -0.08936862
##                                         Non-communicable Mortality  Sucide Rate
## Life expectancy                                         -0.6338148  0.159357534
## Unemployment                                             0.1312424  0.006558983
## Infant Mortality                                         0.6666117  0.059804035
## GDP                                                     -0.1917614  0.114090369
## GNI                                                     -0.1826627  0.130511162
## Clean fuels and cooking technologies                    -0.6552676  0.009767390
## Per Capita                                              -0.3553192  0.322819969
## Mortality caused by road traffic injury                  0.4071462 -0.110258162
## Tuberculosis Incidence                                   0.4808992  0.098586543
## DPT Immunization                                        -0.3815920  0.055675815
## HepB3 Immunization                                      -0.3140154 -0.019783046
## Measles Immunization                                    -0.3862628  0.025607272
## Hospital beds                                           -0.3562093  0.266526138
## Basic sanitation services                               -0.5225441  0.159537407
## Tuberculosis treatment                                   0.2668038 -0.072894819
## Urban population                                        -0.5302888  0.089368619
## Rural population                                         0.5302888 -0.089368619
## Non-communicable Mortality                               1.0000000  0.184023972
## Sucide Rate                                              0.1840240  1.000000000

Визуализация корреляции №1

corrplot(df_cor, method = 'number', addCoef.col = 0.1, 
         number.cex = 0.5,
          tl.cex = 0.4)

Визуализация корреляции №2

testRes = cor.mtest(df_num, conf.level = 0.95)
corrplot(df_cor, p.mat = testRes$p, sig.level = 0.10, order = 'hclust', addrect = 2, number.cex = 0.5, tl.cex = 0.4)

corrplot(df_cor, p.mat = testRes$p, method = 'color', diag = FALSE, type = 'upper',
         sig.level = c(0.001, 0.01, 0.05), pch.cex = 0.9,
         insig = 'label_sig', pch.col = 'grey20', order = 'AOE', number.cex=0.5, tl.cex = 0.4)

#Задание 5

df_clear_scaled <- scale(df_num)
head(df_clear_scaled)
##      Life expectancy Unemployment Infant Mortality         GDP         GNI
## [1,]      -1.1887512   0.78408167       1.37520928 -0.23131589 -0.23883882
## [2,]       0.6085937   0.39076754      -0.70358937 -0.23307415 -0.24083177
## [3,]       0.3395059   1.43864925      -0.05987047 -0.15219202 -0.16296179
## [4,]      -1.4944027  -0.10922743       1.46970013 -0.19478831 -0.20674291
## [5,]       0.3357325  -0.04882006      -0.85713700 -0.24016716 -0.24779098
## [6,]       0.5817890   0.30129329      -0.74492912 -0.00727451 -0.02677993
##      Clean fuels and cooking technologies Per Capita
## [1,]                           -0.8254549 -0.6680197
## [2,]                            0.4052050 -0.4674731
## [3,]                            0.9172919 -0.5250000
## [4,]                           -0.4510260 -0.5732820
## [5,]                            0.9365639  0.0227355
## [6,]                            0.9310577 -0.2767667
##      Mortality caused by road traffic injury Tuberculosis Incidence
## [1,]                              -0.1116583              0.6348233
## [2,]                              -0.5166385             -0.6534901
## [3,]                               0.3704610             -0.3183797
## [4,]                               0.8718650              1.8412208
## [5,]                              -1.6447977             -0.7726405
## [6,]                              -0.2852213             -0.5566804
##      DPT Immunization HepB3 Immunization Measles Immunization Hospital beds
## [1,]       -1.7729743        -1.63130248           -1.7674223   -1.08432585
## [2,]        0.8879797         0.96139344            0.5828081    0.02352559
## [3,]        0.2428999         0.33286109           -0.5544002   -0.50598801
## [4,]       -2.4986890        -2.65266754           -2.7530027   -0.92881827
## [5,]        0.5654398         0.96139344            0.4311803   -0.17591613
## [6,]       -0.1602749        -0.05997162            0.5069942    0.68216505
##      Basic sanitation services Tuberculosis treatment Urban population
## [1,]                -1.0217399              0.7878907       -1.4325629
## [2,]                 0.7851429              0.6118986        0.0903225
## [3,]                 0.3154049              0.4945706        0.6037463
## [4,]                -0.9357671             -0.5027177        0.3027323
## [5,]                 0.2932369             -0.3110819       -1.4861375
## [6,]                 0.5051511             -1.7933262        1.4108863
##      Rural population Non-communicable Mortality Sucide Rate
## [1,]        1.4325629                 2.70651902  -0.3177247
## [2,]       -0.0903225                -1.56166057  -0.5555414
## [3,]       -0.6037463                -0.60061351  -0.7933581
## [4,]       -0.3027323                 0.33216746  -0.6612377
## [5,]        1.4861375                 0.07777265  -1.0575990
## [6,]       -1.4108863                -0.69954482  -0.3969969
df_clear_dist <- dist(df_clear_scaled, 
                        method = "euclidean"
                        )

df_clear_hc <- hclust(d = df_clear_dist, 
                        method = "ward.D2")

#Задание 6

library(factoextra)
## Warning: пакет 'factoextra' был собран под R версии 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_dend(df_clear_hc, 
          cex = 0.1) # cex() - размер лейблов
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

library(pheatmap)
## Warning: пакет 'pheatmap' был собран под R версии 4.3.2
pheatmap(df_clear_scaled, 
         show_rownames = FALSE, 
         clustering_distance_rows = df_clear_dist,
         clustering_method = "ward.D2", 
         cutree_rows = 5,
         cutree_cols = length(colnames(df_clear_scaled)),
         angle_col = 45, 
         main = "Dendrograms for clustering rows and columns with heatmap")

Пояснение к заданию 6: по горизонтали расположены все переменные дата-фрейма, по вертикали каждая из 195 стран, высота слияния древа указывает на различие между двумя объектами/кластерами: чем выше высота слияния, тем менее похожи объекты. Мы проделали сначала нормализацию всех величин дата-фрейма, затем посчитали расстояние между каждыми значениями и создали матрицу с расстоянием. Далее Ward построили иерархическое древо по методу Ward. При построении хитмапа мы указали 5 рядов кластеризации. Визуально в первом сверху кластере можно увидеть страны с низкими показателями иммунизации от кори, гепатита и АКДС. В третьем кластере сверху выделяются страны с высокими значениями ВВП и ВНД. Также в первом ряду выбивается кластер по колонке младенческой смертности.

#Задание 7

library(FactoMineR)
## Warning: пакет 'FactoMineR' был собран под R версии 4.3.2
library(factoextra)
df_full.pca <- prcomp(df_num, 
                        scale = T)
summary(df_full.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6    PC7
## Standard deviation     2.7526 1.4841 1.3952 1.17177 1.08375 0.96347 0.9288
## Proportion of Variance 0.3988 0.1159 0.1025 0.07227 0.06182 0.04886 0.0454
## Cumulative Proportion  0.3988 0.5147 0.6172 0.68945 0.75126 0.80012 0.8455
##                            PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.85740 0.69263 0.68937 0.59106 0.54986 0.47085 0.36596
## Proportion of Variance 0.03869 0.02525 0.02501 0.01839 0.01591 0.01167 0.00705
## Cumulative Proportion  0.88421 0.90946 0.93447 0.95286 0.96877 0.98044 0.98749
##                           PC15    PC16    PC17    PC18      PC19
## Standard deviation     0.34546 0.26941 0.20224 0.06968 1.017e-15
## Proportion of Variance 0.00628 0.00382 0.00215 0.00026 0.000e+00
## Cumulative Proportion  0.99377 0.99759 0.99974 1.00000 1.000e+00
fviz_eig(df_full.pca, addlabels = T, ylim = c(0, 40))

fviz_pca_var(df_full.pca, col.var = "contrib")

fviz_pca_var(df_full.pca, 
             select.var = list(contrib = 3), # Задаём число здесь 
             col.var = "contrib")

fviz_contrib(df_full.pca, choice = "var", axes = 1, top = 24) # 1

fviz_contrib(df_full.pca, choice = "var", axes = 2, top = 24) # 2

fviz_contrib(df_full.pca, choice = "var", axes = 3, top = 24) # 3

Пояснение к заданию 7: первая компонента описывает почти 40% дисперсии. Первые две компоненты объясняют 50% дисперсии. На втором графике явно выделяются только четыре группы переменных: 1 группа - переменные, относящиеся к иммунизации (Measles, DPT, HepB3 Immunization), 2 группа - Hospital Beds, Basic Sanitation Services, Life expectancy, Clean fuels and cooking technologies, 3 группа - Urban population, Per Capita, GDP, GNI, Suicide Rate, 4 группа - другие (rural population, tuberculosis treatment, Inf Mort, Tuberculosis incidence, road injury mortality, Non-communicable mortality). Основная проблема: много переменных, которые не коррелируют друг с другом.

#Задание 8

library(ggbiplot)
## Загрузка требуемого пакета: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Присоединяю пакет: 'plyr'
## Следующие объекты скрыты от 'package:rstatix':
## 
##     desc, mutate
## Следующие объекты скрыты от 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## Следующий объект скрыт от 'package:ggpubr':
## 
##     mutate
## Следующие объекты скрыты от 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## Загрузка требуемого пакета: scales
## Загрузка требуемого пакета: grid
ggbiplot(df_full.pca, 
         scale=0, alpha = 0.1) + 
  theme_minimal()

df_num_with_ch = subset(df, select = -c(Country, Year, Gender)) 
summary(df_num_with_ch)
##  Life expectancy  Unemployment    Infant Mortality      GDP           
##  Min.   :55.49   Min.   : 0.178   Min.   : 1.40    Min.   :1.884e+08  
##  1st Qu.:70.02   1st Qu.: 3.735   1st Qu.: 5.35    1st Qu.:1.117e+10  
##  Median :77.55   Median : 5.960   Median :13.50    Median :3.967e+10  
##  Mean   :75.52   Mean   : 8.597   Mean   :19.61    Mean   :4.660e+11  
##  3rd Qu.:80.95   3rd Qu.:10.958   3rd Qu.:30.23    3rd Qu.:2.476e+11  
##  Max.   :88.10   Max.   :36.442   Max.   :75.80    Max.   :2.143e+13  
##       GNI            Clean fuels and cooking technologies   Per Capita      
##  Min.   :3.754e+08   Min.   :  0.00                       Min.   :   228.2  
##  1st Qu.:1.094e+10   1st Qu.: 34.50                       1st Qu.:  2165.3  
##  Median :4.009e+10   Median : 80.70                       Median :  6624.8  
##  Mean   :4.864e+11   Mean   : 65.98                       Mean   : 16821.0  
##  3rd Qu.:2.457e+11   3rd Qu.:100.00                       3rd Qu.: 19439.7  
##  Max.   :2.171e+13   Max.   :100.00                       Max.   :175813.9  
##  Mortality caused by road traffic injury Tuberculosis Incidence
##  Min.   : 0.00                           Min.   :  0.0         
##  1st Qu.: 8.20                           1st Qu.: 12.0         
##  Median :16.00                           Median : 46.0         
##  Mean   :17.06                           Mean   :103.8         
##  3rd Qu.:24.00                           3rd Qu.:138.5         
##  Max.   :64.60                           Max.   :654.0         
##  DPT Immunization HepB3 Immunization Measles Immunization Hospital beds   
##  Min.   :35.00    Min.   :35.00      Min.   :37.00        Min.   : 0.200  
##  1st Qu.:85.69    1st Qu.:81.31      1st Qu.:84.85        1st Qu.: 1.301  
##  Median :92.00    Median :91.00      Median :92.00        Median : 2.570  
##  Mean   :87.99    Mean   :86.76      Mean   :87.31        Mean   : 2.997  
##  3rd Qu.:97.00    3rd Qu.:96.00      3rd Qu.:96.50        3rd Qu.: 3.773  
##  Max.   :99.00    Max.   :99.00      Max.   :99.00        Max.   :13.710  
##  Basic sanitation services Tuberculosis treatment Urban population
##  Min.   :  8.632           Min.   :  0.00         Min.   : 13.25  
##  1st Qu.: 62.919           1st Qu.: 73.00         1st Qu.: 41.92  
##  Median : 91.144           Median : 82.00         Median : 58.76  
##  Mean   : 77.380           Mean   : 77.57         Mean   : 59.12  
##  3rd Qu.: 98.582           3rd Qu.: 88.00         3rd Qu.: 78.02  
##  Max.   :100.000           Max.   :100.00         Max.   :100.00  
##  Rural population Non-communicable Mortality  Sucide Rate        continent 
##  Min.   : 0.00    Min.   : 4.40              Min.   : 0.300   Africa  :52  
##  1st Qu.:21.98    1st Qu.:11.85              1st Qu.: 2.050   Americas:38  
##  Median :41.24    Median :17.20              Median : 3.500   Asia    :42  
##  Mean   :40.88    Mean   :17.05              Mean   : 4.802   Europe  :48  
##  3rd Qu.:58.08    3rd Qu.:22.10              3rd Qu.: 6.600   Oceania :15  
##  Max.   :86.75    Max.   :43.70              Max.   :30.100
plot = ggbiplot(df_full.pca, 
         scale=0, 
         groups = as.factor(df_num_with_ch$continent), 
         ellipse = T,
         alpha = 0.2) +
  theme_minimal()

ggplotly(plot)

#Задание 9 Интерпретация РСА анализа: большое количество переменных затрудняет интерпретацию. Видно, что в Европе наибольший вес оказывают переменные: Hospital Beds, Basic Sanitation Services, Life expectancy, Clean fuels and cooking technologies. Америка и Азия перекрывают друг друга больше остальных контитнентов. В Африке наибольший вес оказывают переменные: rural population, tuberculosis treatment, Inf Mort, Tuberculosis incidence, road injury mortality, Non-communicable mortality. Можно сделать вывод, что в Африке больше сельского населения, выше младенческая смертность, заболеваемость туберкулезом, смертность в ДТП и т.д. В Европе наоборот выше продолжительность жизни, уровень санитарии и т.д.

#Задание 10

#Задание 11

#PCA1
df_pca1 = subset(df, select = -c(Country, Year, Gender, continent, GDP, GNI, `Life expectancy`,`HepB3 Immunization`, `Measles Immunization`)) 

pca1 <- prcomp(df_pca1, 
                        scale = T)
summary(pca1)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.4484 1.2142 1.1844 1.00213 0.94281 0.85779 0.80194
## Proportion of Variance 0.4282 0.1053 0.1002 0.07173 0.06349 0.05256 0.04594
## Cumulative Proportion  0.4282 0.5335 0.6337 0.70542 0.76891 0.82147 0.86741
##                            PC8     PC9    PC10    PC11    PC12    PC13
## Standard deviation     0.68575 0.68504 0.58175 0.53716 0.41185 0.34666
## Proportion of Variance 0.03359 0.03352 0.02417 0.02061 0.01212 0.00858
## Cumulative Proportion  0.90100 0.93452 0.95869 0.97930 0.99142 1.00000
##                             PC14
## Standard deviation     5.007e-16
## Proportion of Variance 0.000e+00
## Cumulative Proportion  1.000e+00
fviz_eig(pca1, addlabels = T, ylim = c(0, 40))

fviz_pca_var(pca1, col.var = "contrib")

fviz_pca_var(pca1, 
             select.var = list(contrib = 3), # Задаём число здесь 
             col.var = "contrib")

fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1

fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2

fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3

ggbiplot(pca1, 
         scale=0, alpha = 0.1) + 
  theme_minimal()

#PCA2
df_pca2 = subset(df_pca1, select = -c(`DPT Immunization`, `Unemployment`, `Infant Mortality`, `Per Capita`, `Hospital beds`)) 

pca2 <- prcomp(df_pca2, 
                        scale = T)
summary(pca2)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.1144 1.0877 1.0216 0.90733 0.73692 0.66282 0.60591
## Proportion of Variance 0.4967 0.1315 0.1159 0.09147 0.06034 0.04881 0.04079
## Cumulative Proportion  0.4967 0.6282 0.7441 0.83562 0.89595 0.94477 0.98556
##                            PC8       PC9
## Standard deviation     0.36049 8.036e-16
## Proportion of Variance 0.01444 0.000e+00
## Cumulative Proportion  1.00000 1.000e+00
fviz_eig(pca2, addlabels = T, ylim = c(0, 40))

fviz_pca_var(pca2, col.var = "contrib")

fviz_pca_var(pca2, 
             select.var = list(contrib = 3), # Задаём число здесь 
             col.var = "contrib")

fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1

fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2

fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3

ggbiplot(pca2, 
         scale=0, alpha = 0.1) + 
  theme_minimal()

#PCA3
df_pca3 = subset(df_pca2, select = -c(`Sucide Rate`, `Rural population`, `Urban population`, `Tuberculosis Incidence`, `Tuberculosis treatment`)) 

pca3 <- prcomp(df_pca3, 
                        scale = T)
summary(pca3)
## Importance of components:
##                          PC1    PC2    PC3     PC4
## Standard deviation     1.688 0.7817 0.6361 0.36929
## Proportion of Variance 0.712 0.1528 0.1011 0.03409
## Cumulative Proportion  0.712 0.8648 0.9659 1.00000
fviz_eig(pca3, addlabels = T, ylim = c(0, 40))

fviz_pca_var(pca3, col.var = "contrib")

fviz_pca_var(pca3, 
             select.var = list(contrib = 3), # Задаём число здесь 
             col.var = "contrib")

fviz_contrib(pca1, choice = "var", axes = 1, top = 24) # 1

fviz_contrib(pca1, choice = "var", axes = 2, top = 24) # 2

fviz_contrib(pca1, choice = "var", axes = 3, top = 24) # 3

ggbiplot(pca3, 
         scale=0, alpha = 0.1) + 
  theme_minimal()

Пояснение к заданию 11: кумулятивный процент объясненной вариации повышается при каждом повторе эксперимента. К третьему повтору первая главная компонента объясняет 71 процент вариации.